This example uses simulated data to demonstrate how to include Date as a continuous regressor and Site as a discrete regressor for Etiology and False Positive Rate (FPR) in conditional independence model.

rm(list=ls())
library(baker)
library(lubridate)
library(rjags)
library(mgcv)

Data Simulation

The function simulate_nplcm() can simulate data for conditional independence model by setting K = 2 and control/case subclass weights to c(1,0). Other parameter settings in this example include:

In this example, we index the two sites as 1, 2 and assume dates varying from 1 to 300. Suppose we have 300 cases and 200 controls for each site. For each subject, date is randomly sampled from 1 to 300. The corresponding etiology probability (\(\pi\)) and FPR for MBS1 (\(\psi\)) are calculated according to the following additive regression models. The regression parameters for Etiology regression \(\beta_{ij}\) for \(i=0,1,2,3\) and \(j=1,...,J\) is specified in Eti.reg.paraMat.list. The regression parameters for FPR regression \(\alpha_{ij}\) for \(i=0,1,2,3\) and \(j=1,...,J\) is specified in FPR.reg.paraMat.list.

\[log\phi_j=\beta_{0j}+\beta_{1j}\mathbb{1}\{Site=1\}+\beta_{2j}(Date-150)+\beta_{3j}(Date-150)^2, \space \pi_j=\frac{\phi_j}{\sum_{k=1}^J\phi_k}\] \[\psi_j=\alpha_{0j}+\alpha_{1j}\mathbb{1}\{Site=1\}+\alpha_{2j}(Date-150)+\alpha_{3j}(Date-150)^2\]

The regressor SITE and DATE should be specified in data_nplcm$X as a data frame. Since simulate_nplcm() set the returned data_nplcm$X to NULL, we should manually define data_nplcm$X as required. Further, data_nplcm$X should also contain a column Y recording the case/control status, which should be the same asdata_nplcm$Y. After combining the data of all sites, we need to adjust the order of subjects proporly to make sure that the case group is on top of the control group, as required by the model fitting function nplcm().

J = 3                          # number of causes
cause_list = c(LETTERS[1:J])   # cause list
N.SITE = 2                     # number of sites
K = 2                          # number of subclasses
lambda = c(1,0)                # subclass weights for control group
eta = c(1,0)                   # subclass weights for case group
Nd = 300                       # number of subjects for case group
Nu = 200                       # number of subjects for control group


# etiology regression simulation
Eti.reg.paraMat.list = list(c(3,-1,0,(-2+1)/150^2),c(4,1,0,(-5+2)/150^2),c(1,0,0,0))   
compute_Eti_given_date <- function(siteID,date){
  Phi = sapply(Eti.reg.paraMat.list, function(para) 
    return(exp(para[1]+para[2]*(siteID==1)+para[3]*(date-150)+para[4]*(date-150)^2)))
  Eti = Phi/sum(Phi)
  return(Eti)
}

# FPR regression simulation
FPR.reg.paraMat.list = list(c(0.35,-0.05,0,-0.1/150^2),c(0.35,-0.05,0,-0.1/150^2),c(0.2,-0.05,0,-0.05/150^2))
compute_FPR_given_date <- function(siteID,date) {
  return(sapply(FPR.reg.paraMat.list, function(para) return(para[1]+para[2]*(siteID==1)+para[3]*(date-150)+para[4]*(date-150)^2)))
}


set.seed(20181018)
data_nplcm_list = lapply(1:N.SITE,function(siteID){
  
  fulldata_list_perSite = lapply(1:(Nd+Nu), function(caseID){
    date = sample(1:300,1)
    set_parameter <- list(
      cause_list      = cause_list,
      etiology        = compute_Eti_given_date(siteID,date), 
      pathogen_BrS    = LETTERS[1:J],
      SS = T,
      pathogen_SS     = LETTERS[1:2],
      meas_nm         = list(MBS = c("MBS1"),MSS=c("MSS1")),
      Lambda          = lambda,         # for BrS   
      Eta             = t(replicate(J,eta)),  # case subclass weight for BrS
      PsiBS           = cbind(compute_FPR_given_date(siteID,date),
                              compute_FPR_given_date(siteID,date)), # FPR
      PsiSS           = cbind(rep(0,J),rep(0,J)),
      ThetaBS         = cbind(c(0.95,0.9,0.85),    # TPR
                              c(0.95,0.9,0.85)),
      ThetaSS         = cbind(c(0.25,0.10,0.15),
                              c(0.25,0.10,0.15)),
      Nu      =     1, 
      Nd      =     1 
    )
    simu_out   <- simulate_nplcm(set_parameter)
    out <- simu_out$data_nplcm
    out$X = data.frame(SITE=rep(siteID,2),DATE=rep(date,2))
    return(out)
  })
  
  # extract cases for the first Nd data lists and extract controls for the last Nu data lists
  data_perSite = list(
    Mobs=list(
      MBS = list(
        MBS1 = Reduce(rbind, c(lapply(fulldata_list_perSite[1:Nd], function(l) l$Mobs$MBS$MBS1[1,]),
                               lapply(fulldata_list_perSite[(Nd+1):(Nd+Nu)], function(l) l$Mobs$MBS$MBS1[2,])))),
      MSS = list(
        MSS1 = Reduce(rbind, c(lapply(fulldata_list_perSite[1:Nd], function(l) l$Mobs$MSS$MSS1[1,]),
                               lapply(fulldata_list_perSite[(Nd+1):(Nd+Nu)], function(l) l$Mobs$MSS$MSS1[2,])))),
      MGS = NULL),
    Y = Reduce(c, c(lapply(fulldata_list_perSite[1:Nd], function(l) l$Y[1]),
                    lapply(fulldata_list_perSite[(Nd+1):(Nd+Nu)], function(l) l$Y[2]))),
    X = Reduce(rbind, c(lapply(fulldata_list_perSite[1:Nd], function(l) l$X[1,]),
                        lapply(fulldata_list_perSite[(Nd+1):(Nd+Nu)], function(l) l$X[2,]))))
  
  return(data_perSite)
})


# put cases on top of controls
data_nplcm_order = Reduce(c,lapply(1:N.SITE, function(i) ((i-1)*(Nd+Nu)+1):((i-1)*(Nd+Nu)+Nd)))
data_nplcm_order = c(data_nplcm_order, setdiff(1:((Nd+Nu)*N.SITE),data_nplcm_order))
data_nplcm = list(
  Mobs = list(
    MBS = list(MBS1 = Reduce(rbind, lapply(data_nplcm_list, function(l) l$Mobs$MBS$MBS1))[data_nplcm_order,]),
    MSS = list(MSS1 = Reduce(rbind, lapply(data_nplcm_list, function(l) l$Mobs$MSS$MSS1))[data_nplcm_order,]),
    MGS=NULL),
  Y = Reduce(c, lapply(data_nplcm_list, function(l) l$Y))[data_nplcm_order],
  X = Reduce(rbind, lapply(data_nplcm_list, function(l) l$X))[data_nplcm_order,])
# add the column Y recording case/control status
data_nplcm$X=cbind(data_nplcm$X,data.frame(Y=data_nplcm$Y))  

Data Exploration

We can visulize the seasonality trend of the positive rate of a given pathogen in MBS1 by calling the function visualize_season().For a selected pathogen, (pathogen B in the following example), it plots the conditional probability \(P(M_i^B=1|t_i)\) against date \(t_i\), separated by case group (in black) and control group (in blue), where the bars record \(M_i^B\). Further, the solid line is a fitted logistic regression line to model the trend. Note that the function uses data_nplcm$X$ENRLDATE as date by defualt, so we need to make sure that this column exists.

ENRLDATE.seq = seq(as.Date('2010-01-06'),as.Date('2010-11-01'),by=1) # 300 days
data_nplcm$X$ENRLDATE = ENRLDATE.seq[data_nplcm$X$DATE]
visualize_season(data_nplcm,"B")

Model Specification

model_options includes the following three aspects.

BrS_object_1 <- make_meas_object(LETTERS[1:J],"MBS","1","BrS",cause_list)
SS_object_1 <- make_meas_object(LETTERS[1:2],"MSS","1","SS",cause_list)

model_options <- list(likelihood   = list(cause_list = cause_list,        # <---- fitted causes.
                                   k_subclass = c(1),                     # <---- no. of subclasses
                                   Eti_formula = ~ s_date_Eti(DATE,Y,basis='ncs',3) +as.factor(SITE),               
                                   FPR_formula = list(
                                     MBS1 =  ~ s_date_FPR(DATE,Y,basis = "ps",3) +as.factor(SITE)
                                   )),  
               use_measurements = c("BrS","SS"),  # <---- which measurements to use to inform etiology
               prior  = list(
                    Eti_prior  = 1, # <--- etiology prior.
                    TPR_prior   = list(
                      BrS  = list(info  = "informative",
                                  input = "match_range",
                                  val   = list(
                                    MBS1 = list(up = list(rep(0.99,length(BrS_object_1$patho))),
                                                 low = list(rep(0.5,length(BrS_object_1$patho))) 
                                    ))
                      ),
                      SS   = list(info  = "informative",
                                  input = "match_range",
                                  val   = list(
                                    MSS1 = list(up = list(rep(0.5,length(SS_object_1$patho))),
                                                low = list(rep(0.01,length(SS_object_1$patho))) 
                                    ))
                      )
                    )  # <---- TPR prior.
)
)     

assign_model(model_options,data_nplcm)
## $num_slice
## MBS MSS MGS 
##   1   1   0 
## 
## $nested
## [1] FALSE
## 
## $regression
## $regression$do_reg_Eti
## [1] TRUE
## 
## $regression$do_reg_FPR
## $regression$do_reg_FPR$MBS1
## [1] TRUE
## 
## 
## $regression$is_discrete_predictor
## $regression$is_discrete_predictor$Eti
## [1] FALSE
## 
## $regression$is_discrete_predictor$FPR
##  MBS1 
## FALSE 
## 
## 
## 
## $BrS_grp
## [1] FALSE
## 
## $SS_grp
## [1] FALSE

Model Fitting

Because we use JAGS for Bayesian inference of the models, we need to provide .bug files with model likelihood and prior. The package baker can interpret the specified models and write them in .bug files. In this case, baker uses write_model_Reg_NoNest() to write .bug file for conditional independence models with discrete regression, accommodating multiple bronze-standard and silver-standard data.

# parent directory for testing code (LOCAL):
working_dir <- tempdir()

# date stamp for analysis:
Date     <- gsub("-", "_", Sys.Date())
# include stratification information in file name:
dated_strat_name    <- paste0(working_dir,Date,"_continuous_predictor")
# create folder
result_folder <- dated_strat_name
dir.create(result_folder)


# options for MCMC chains:
mcmc_options <- list(debugstatus = TRUE,
                     n.chains   = 1,
                     n.itermcmc = 10000,
                     n.burnin   = 5000,
                     n.thin     = 10,
                     individual.pred = !TRUE,
                     ppd             = !TRUE,
                     get.pEti        = !TRUE,
                     result.folder = result_folder,
                     bugsmodel.dir = result_folder,
                     jags.dir = "",
                     use_jags = TRUE)

# Record the settings of current analysis:
dput(data_nplcm,file.path(mcmc_options$result.folder,"data_nplcm.txt")) # <-- in case covariate data X are needed for plotting.

rjags::load.module("glm")

gs <- nplcm(data_nplcm,model_options,mcmc_options)

Posterior Diagnosis

The file jagsdata.txt contains all data and parameters used when fitting the model, which is saved in bugs.dat as follows. The files CODAchain1.txt and CODAindex.txt together records the posterior samples for:

The posterior samples can be extracted by coda::read.coda and saved in res_nplcm. The function print_res prints the traceplot for selected series. For instance, the following code prints the traceplots for betaEti.

# JAGS:
DIR_NPLCM <- result_folder
new_env   <- new.env()
source(file.path(DIR_NPLCM,"jagsdata.txt"),local=new_env)
bugs.dat <- as.list(new_env)
rm(new_env)
res_nplcm <- coda::read.coda(file.path(DIR_NPLCM,"CODAchain1.txt"),
                             file.path(DIR_NPLCM,"CODAindex.txt"),
                             quiet=TRUE)

get_res   <- function(x) res_nplcm[,grep(x,colnames(res_nplcm))]
print_res <- function(x) for (i in grep(x,colnames(res_nplcm))) plot(res_nplcm[,i],main=paste0('Trace of ',colnames(res_nplcm)[i]))

print_res("betaEti")

We can recover posterior etiology probabilities from posterior betaEti. Note that the transformed design matrix for Etiology regression has been saved in Z_Eti (which is derived by sourcing jagsdata.txt). First, we structure posterior betaEti as an array of dimension #posterior samples * #parameters * J. For each posterior sample, posterior etiology probabilities for each case can be calculated as normalized exp(Z_Eti%*%betaEti). Further, posterior Etiology mean and quantiles for each case can be calculated by marginalizing over all posterior samples.

# structure the posterior samples:
JBrS_1        <- ncol(bugs.dat$MBS_1)
n_samp_kept   <- nrow(res_nplcm)
Jcause        <- bugs.dat$Jcause
Nd            <- bugs.dat$Nd
Nu            <- bugs.dat$Nu
n_unique_Eti_level <- bugs.dat$n_unique_Eti_level

betaEti_post <- array(get_res("betaEti"),c(n_samp_kept,5,Jcause)) #   of dimension (#samples * #paras * J)
Phi_post <- array(t(apply(betaEti_post,1,function(beta) exp(Z_Eti %*% beta))),c(n_samp_kept,dim(Z_Eti)[1],Jcause)) # of dimension (#samples * #cases * J)
Eti_post <- apply(Phi_post,c(1,2), function(Phi) Phi/sum(Phi)) # of dimension(J * #samples * #cases)
Eti_post_mean <- apply(Eti_post, c(1,3), mean)  # of dimension(J * #cases)
Eti_post_q <- apply(Eti_post,c(1,3),quantile,c(0.025,0.975)) # # of dimension(2 * J * #cases)

Further, we can plot posterior etiology probabilities against date and compare it with our simulation assumption. In the following plot, red lines are for Site 1 while green lines are for Site 2. The solid lines represents the true relationship between date and Etiology, which we formulated in data simulation. The dots plot posterior Eiology mean against date and the boundary of the shadow area correponds to 2.5% and 97.5% posterior quantiles. From the plot, the natural cubic spline regression with degree of freedom 3 can roughly reproduce the quadratic additive formulation.

library(ggplot2)
library(dplyr)

plot_Eti_posterior <- function(EtiIndex) {
  tmp = data.frame(Date = data_nplcm$X$DATE[1:600],
                   Site = data_nplcm$X$SITE[1:600],
                   Eti_mean = Eti_post_mean[EtiIndex,],
                   Eti_lq = Eti_post_q[1,EtiIndex,],
                   Eti_hq = Eti_post_q[2,EtiIndex,]) 
  tmp = tmp%>% 
    mutate(Eti_true = sapply(1:600, function(i) compute_Eti_given_date(tmp$Site[i],tmp$Date[i])[EtiIndex])) 
  p<- ggplot(tmp) +
    geom_point(aes(Date,Eti_mean,color=as.factor(Site)),size=0.5)+
    geom_ribbon(aes(x=Date,ymin=Eti_lq,ymax=Eti_hq,color=as.factor(Site)),alpha=0.3)+
    geom_line(aes(Date,Eti_true,color=as.factor(Site)),size=1)
  return(p)
}

gridExtra::grid.arrange(grobs=lapply(1:Jcause, plot_Eti_posterior),nrow=3,ncol=1)